
import time
import random
import requests
import pandas as pd
from lxml import etree

# Request headers: identify the client with a desktop Chrome User-Agent string
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/94.0.4606.81 Safari/537.36'
    ),
}

# URL generation
# City name as it appears in the URL path; also reused for the output file name
city = 'guangzhou'

url_root = 'http://www.tianqihoubao.com/lishi/'
# Two-digit month codes '01' .. '12'
month_list = ['{:02d}'.format(number) for number in range(1, 13)]
# One inner list per year (2020 and 2021), each holding that year's twelve
# monthly page URLs in calendar order
url_list = [
    ['{}{}/month/{}{}.html'.format(url_root, city, year, month)
     for month in month_list]
    for year in range(2020, 2022)
]

# Empty frame that fixes the output columns; the crawl results are merged into it
df = pd.DataFrame(
    columns=['日期', '白天天气', '夜晚天气', '最高气温', '最低气温', '风向风力'],
)

# Crawl
# Column order of every monthly frame, and therefore of the exported sheet.
_COLUMNS = ['日期', '白天天气', '夜晚天气', '最高气温', '最低气温', '风向风力']
# Seconds to wait for the server before requests raises a Timeout; without a
# timeout a stalled connection would block the script indefinitely.
_REQUEST_TIMEOUT = 10


def _compact(text):
    """Return *text* with all whitespace removed."""
    return ''.join(text.split())


def _split_pair(text):
    """Return the first two '/'-separated parts of *text*.

    The second part is '' when *text* contains no '/', so a cell that lacks
    the separator produces a blank value instead of raising IndexError.
    """
    parts = text.split('/')
    return parts[0], (parts[1] if len(parts) > 1 else '')


def _parse_month(page_text):
    """Build a DataFrame with one row per dated table row of a monthly page.

    Rows are read from the table(s) inside ``<div class="wdetail">``. Each
    row is parsed on its own (first cell: date link, next three cells:
    weather, temperature, wind), so an empty cell only blanks that one field
    and cannot shift the values of the following days.

    Args:
        page_text: HTML source of one monthly history page.

    Returns:
        A DataFrame with the columns in ``_COLUMNS``; it has no rows when the
        page contains no usable table rows.
    """
    root = etree.HTML(page_text) if page_text.strip() else None
    rows = root.xpath('//div[@class="wdetail"]//tr') if root is not None else []

    records = []
    for row in rows:
        cells = row.xpath('./td')
        if len(cells) < 4:
            continue
        # Date: text of the link in the first cell. Rows whose first cell has
        # no link text (e.g. a header row) carry no date and are skipped.
        day = _compact(''.join(cells[0].xpath('./a/text()')))
        if not day:
            continue

        # Only the cells' own text nodes are used, with whitespace stripped.
        weather, temperature, wind = (
            _compact(''.join(cell.xpath('./text()'))) for cell in cells[1:4]
        )
        # Weather and temperature cells hold two '/'-separated values.
        weather_day, weather_night = _split_pair(weather)
        temperature_high, temperature_low = _split_pair(temperature)

        records.append({
            '日期': day,
            '白天天气': weather_day,
            '夜晚天气': weather_night,
            '最高气温': temperature_high,
            '最低气温': temperature_low,
            '风向风力': wind,
        })

    return pd.DataFrame(records, columns=_COLUMNS)


monthly_frames = []
for urls in url_list:
    for url in urls:
        # Fetch
        response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)

        if response.ok:
            df_new = _parse_month(response.text)
            print(df_new)
            # Empty frames add nothing to the result, so they are left out
            # of the final concatenation.
            if not df_new.empty:
                monthly_frames.append(df_new)
        else:
            # An error page holds no weather table; report it rather than
            # parsing it as if it were data.
            print('{} -> HTTP {}, skipped'.format(url, response.status_code))

        # Pause 1-2 seconds between requests.
        time.sleep(random.randint(1, 2))

# Concatenate once instead of growing the frame inside the loop, and renumber
# the rows so the exported index is unique rather than restarting at 0 for
# every month. When nothing was scraped, the existing empty `df` is exported
# unchanged.
if monthly_frames:
    df = pd.concat(monthly_frames, axis=0, ignore_index=True)

df.to_excel(city + ".xlsx")

print(city + "天气数据爬取完成")


